Modelling eel abundance in the Vilaine bassin

0. Setup

Load dependencies.

library(renv)
renv::load()
renv::restore() # Install missing librairies.
renv::status() # Check the project state.
library(zen4R)
library(here)

Download data from Zenodo. The data file is relatively heavy (~800Mb), this operation can take a few minutes.

doi <- "10.5281/zenodo.17962542"
dest_dir <- here("data")
if (!dir.exists(dest_dir)) dir.create(dest_dir, recursive = TRUE)
download_zenodo(doi, path = dest_dir)
unzip(here(dest_dir, "miste_data.zip"), exdir = dest_dir)

If everything went well, the data should have been downloaded in data/zenodo/.

1. River data

First, we focus on the river geographic data. Let’s load it.

base::load(here(dest_dir, "zenodo", "processed_data", "reseaux_hydrographiques.rda"))

For now, we will focus on the Vilaine bassin. We can view it using sfnetworks.

library(sf)
library(sfnetworks)
net <- as_sfnetwork(rht_vilaine)
plot(net, cex = 0.5)

Caution

We can see that the hydrographic network is not entirely connected. For example, a small portion in the south west is disconnected from the rest of the bassin. We should keep this in mind for later, as it could bias our statistical model.

We want to check that the network data is valid. In particular, we want to verify that there is no duplicate in the nodes, or edges data.

nodes <- net |>
    activate("nodes") |>
    st_as_sf()
sum(duplicated(nodes))
[1] 0
library(dplyr)
library(tidyr)

edges <- net |>
    activate("edges") |>
    st_as_sf()
sum(duplicated(select(edges, c("from", "to"))))
[1] 0

We see that there is no duplication for the Vilaine network. We can proceed to the next step.

2. Environmental data

2.1 Load data

base::load(here(dest_dir, "zenodo", "processed_data", "poissons_env_temp.RData"))

env
# A tibble: 5,777 × 14
   sta_id pop_id ope_id ope_date            annee distance_mer altitude
    <int>  <int>  <int> <dttm>              <dbl>        <int>    <int>
 1   9655  37720  22622 1996-09-24 10:09:00  1996           NA      305
 2   9225  35420  16755 1998-06-19 09:30:00  1998           NA       80
 3   9622  37544  16012 2000-05-22 14:45:00  2000          328       98
 4   9762  38357  17091 2006-09-28 14:00:00  2006          216       31
 5   9384  36157  17349 2006-06-21 14:30:00  2006          482      140
 6   9712  38066  17621 2016-09-08 10:30:00  2016          400      130
 7   9454  36580  17379 2010-09-14 15:00:00  2010          500      124
 8   9520  36941  17079 2007-06-14 14:30:00  2007          224       37
 9   9381  36135  17154 2006-06-20 00:00:00  2006          387      100
10   9284  35724  17596 2016-09-29 10:00:00  2016          212       32
# ℹ 5,767 more rows
# ℹ 7 more variables: surface_bv <dbl>, distance_source <dbl>, largeur <dbl>,
#   pente <dbl>, profondeur <dbl>, temp_juillet <dbl>, temp_janvier <dbl>

Let’s rename column for clarity.

env <- env |> rename(year = annee)

Station geographical coordinates are stored in points_geo_<river>. We visualize them over the hydrographic network.

plot(net, col = "black", cex = 0.5)
plot(points_geo_vilaine, col = "red", add = TRUE, pch = 16, cex = 0.5)

2.2. Visualisation of altitude data

To begin with, let’s focus on a single environment variable: the altitude.

d_env <- env |>
    select(c("sta_id", "pop_id", "altitude")) |>
    distinct()
d_env |> filter(is.na(altitude))
# A tibble: 2 × 3
  sta_id pop_id altitude
   <int>  <int>    <int>
1   8853  33379       NA
2  13994  51300       NA

We see that we have missing values for two stations: 8853 and 13994. Next, we want to relate altitude values to geographical coordinates.

pts_vilaine <- points_geo_vilaine |>
    select(c("pop_id", "geometry")) |>
    left_join(d_env, by = "pop_id") |>
    filter(!is.na(altitude))
pts_vilaine
Simple feature collection with 37 features and 3 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: -2.965245 ymin: 47.46923 xmax: -1.051178 ymax: 48.3493
Geodetic CRS:  WGS 84
First 10 features:
   pop_id sta_id altitude                   geometry
1   46885  12253       25 POINT (-2.429369 47.71654)
2   46166  11854      176 POINT (-2.965245 48.32017)
3   47601  12434       11 POINT (-1.796815 47.46923)
4   46671  12118        9 POINT (-2.332695 47.76529)
5   47659  12447        3  POINT (-2.142875 47.5878)
6   47613  12436        5 POINT (-1.867647 47.70129)
7   46484  12007       41  POINT (-2.35679 48.01553)
8   47460  12407       48 POINT (-1.410521 47.71718)
9   47677  12451       16 POINT (-2.563233 47.59471)
10  47326  12380       30 POINT (-1.562385 48.03841)

We see that the altitude is not measured on every sampling points, but only in 37 locations. Let’s plot them.

library(ggplot2)
set_theme(theme_minimal())
options(
    ggplot2.continuous.colour = "viridis",
    ggplot2.continuous.fill = "viridis"
)

ggplot() +
    geom_sf(data = points_geo_vilaine, color = "grey80", alpha = 0.5) +
    geom_sf(data = pts_vilaine, aes(color = altitude), size = 5) +
    labs(
        color = "Altitude (m)"
    )

We see in grey every location were we have sample of fish abundances, and in colour where we have altitude data. We have to interpolate altitude data for missing locations. The simplest way to do it is to the “nearest neighbour” interpolation.

2.3. Interpolation of altitude data

We first build proximity polygons with the terra::voronoi function.

library(terra)

d_elev <- select(pts_vilaine, c("geometry", "altitude"))
d_elev.vect <- terra::vect(d_elev)
v <- voronoi(d_elev.vect)
plot(v, "altitude")
points(d_elev.vect)

v.sf <- st_as_sf(v)
pred <- st_intersection(v.sf, points_geo_vilaine)
d_elev.nn <- select(pred, c("altitude", "pop_id", "geometry"))

ggplot() +
    geom_sf(data = pred, aes(color = altitude)) +
    geom_sf(data = pts_vilaine, aes(color = altitude), size = 5) +
    labs(
        color = "Altitude (m)"
    )

Predicted altitudes seem consistent with the nearest neighbour interpolation.

2.4. Cross-validation of altitude interpolation

Now, that we have a prediction pipeline working we will estimate its performance using cross-validation. To do so, we will do a LOOCV (Leave-One-Out Cross-Validation).

n <- nrow(d_elev)
preds <- numeric(n) # Vector filled with 0s.
for (i in 1:n) {
    train <- d_elev[-i, ]
    test <- d_elev[i, ]
    v_train <- voronoi(terra::vect(train))
    preds[i] <- st_intersection(st_as_sf(v_train), test)$altitude
}
d_elev$altitude.loo <- preds

ggplot(d_elev, aes(x = altitude, y = altitude.loo)) +
    geom_point() +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    labs(
        x = "Observed altitude (m)",
        y = "Predicted altitude (m)"
    )

rmse <- sqrt(mean((d_elev$altitude - d_elev$altitude.loo)^2))
rmse
[1] 18.74797

3. Abundance data

3.1. Load data

catches <- catches |>
    rename(
        species = esp_code_alternatif,
        year = annee,
        abundance = effectif
    )
catches
# A tibble: 69,823 × 6
   sta_id pop_id ope_id  year species abundance
    <int>  <int>  <int> <dbl> <chr>       <int>
 1   8530  31870  19943  2017 TRF           159
 2   8530  31870  19943  2017 VAI            82
 3   8530  31870  19944  2011 TRF           217
 4   8530  31870  19944  2011 VAI             7
 5   8530  31870  19945  2009 TRF           232
 6   8530  31870  19945  2009 VAI             9
 7   8530  31870  19946  2007 TRF           213
 8   8530  31870  19946  2007 VAI            36
 9   8530  31870  83029  2019 TRF           126
10   8530  31870  83029  2019 VAI           104
# ℹ 69,813 more rows
species_names <- unique(catches$species)

Next, let’s add zeros where species are not found.

catches <- catches |>
    pivot_wider(
        names_from = species,
        values_from = abundance,
        values_fill = list(abundance = 0)
    ) |>
    pivot_longer(
        cols = species_names,
        names_to = "species",
        values_to = "abundance"
    )

catches
# A tibble: 450,606 × 6
   sta_id pop_id ope_id  year species abundance
    <int>  <int>  <int> <dbl> <chr>       <int>
 1   8530  31870  19943  2017 TRF           159
 2   8530  31870  19943  2017 VAI            82
 3   8530  31870  19943  2017 LOF             0
 4   8530  31870  19943  2017 CHA             0
 5   8530  31870  19943  2017 SDF             0
 6   8530  31870  19943  2017 PFL             0
 7   8530  31870  19943  2017 BAF             0
 8   8530  31870  19943  2017 CHE             0
 9   8530  31870  19943  2017 GOU             0
10   8530  31870  19943  2017 OBR             0
# ℹ 450,596 more rows

3.2. Preliminary plots

Let’s plot abundance histogram for each species. As most species abundance are dominated by zeros, because of absence data, we plot log(abundance + 1) instead of raw abundances. This also implies that we will need to use zero-inflated distributions in our statistical model to account for this absence data.

catches |>
    ggplot(aes(x = abundance + 1)) + # add 1 to avoid log(0)
    geom_histogram(binwidth = 0.5) +
    scale_x_log10() +
    facet_wrap(~species)

Let’s plot abundance time series for the 10 most abundant species.

n_species <- 10
common_species <- catches |>
    group_by(species) |>
    summarise(mean_abundance = mean(abundance)) |>
    arrange(desc(mean_abundance)) |>
    slice(1:n_species) |>
    pull(species)

catches |>
    filter(species %in% common_species) |>
    group_by(year, species) |>
    summarise(mean_abundance = mean(abundance)) |>
    ggplot(aes(x = year, y = mean_abundance, colour = species)) +
    geom_line()

There is no clear trend, but we can’t say much at this point because trend may be masked by sampling efforts and environmental covariates.

Note

Here it would be interesting to plot the number of operations per year to see the evolution of the sampling effort.

d_eel <- catches |> filter(species == "ANG")
d_eel.avg <- d_eel |>
    group_by(pop_id) |>
    summarise(abundance_mean = mean(abundance))
data <- inner_join(d_elev.nn, d_eel.avg, by = "pop_id")

ggplot(data, aes(x = altitude, y = abundance_mean)) +
    geom_point() +
    labs(
        x = "Altitude (m)",
        y = "Eel abundance"
    )

We clearly observe the expected trend of eel abundance with altitude.

Next, we want to model the effect of altitude, while accounting for time using INLA.

4. Statistical models

Data is ready to model eel abundances in space, time and with altitude. In this section, we will build increasingly complex models.

Before doing anything fancy, let’s load INLA, and scale our predictor variable (altitude) as it is good practice.

library(INLA)

data <- inner_join(d_eel, d_elev.nn, by = "pop_id")
data$altitude_s <- scale(data$altitude)

4.1. Abundance distribution and overdisperion

Here, we focus on the choice of the distribution of our target variable: eel abundance. We will focus particularly on overdispersion, as we know that eel distribution is zero-inflated due to absence data. For the following model, we add a random effect for stations and year. We will discuss in further details these points in following sections.

4.1.1. Poisson model

We begin with a poisson GLM.

m.poisson <- inla(
    abundance ~ 1 + altitude_s +
        f(year, model = "iid") +
        f(pop_id, model = "iid"),
    family = "poisson",
    data = data,
    control.compute = list(config = TRUE, waic = TRUE, dic = TRUE)
)
summary(m.poisson)
Time used:
    Pre = 0.753, Running = 0.273, Post = 0.118, Total = 1.14 
Fixed effects:
              mean    sd 0.025quant 0.5quant 0.975quant   mode kld
(Intercept)  1.921 0.165      1.594    1.922      2.244  1.922   0
altitude_s  -1.404 0.169     -1.739   -1.403     -1.075 -1.403   0

Random effects:
  Name    Model
    year IID model
   pop_id IID model

Model hyperparameters:
                      mean    sd 0.025quant 0.5quant 0.975quant  mode
Precision for year   10.86 2.719      6.395    10.57      17.03 10.03
Precision for pop_id  1.26 0.331      0.723     1.22       2.02  1.15

Deviance Information Criterion (DIC) ...............: 4552.01
Deviance Information Criterion (DIC, saturated) ....: 2685.27
Effective number of parameters .....................: -737.88

Watanabe-Akaike information criterion (WAIC) ...: 10992.89
Effective number of parameters .................: 2810.03

Marginal log-Likelihood:  -3209.39 
 is computed 
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')

We can plot the distribution of the fixed effect (altitude).

marginal.altitude <- m.poisson$marginals.fixed$altitude_s
ggplot(as.data.frame(marginal.altitude)) +
    geom_line(aes(x = x, y = y)) +
    labs(
        x = "Altitude effect size",
        y = "Probability"
    )

We can compute the highest posterior density (HDP) credible interval.

inla.hpdmarginal(0.97, m.poisson$marginals.fixed$altitude_s)
                 low      high
level:0.97 -1.774432 -1.037384

We can also sample from the posterior distribution.

m.poisson.samples <- inla.posterior.sample(100, m.poisson)

We represent the sampled effect size of altitude against the marginal distribution obtained before.

fun <- function(...) {
    altitude_s
}
altitude.sample <- inla.posterior.sample.eval(fun, m.poisson.samples)
tab <- data.frame(altitude = altitude.sample[1, ])
ggplot(tab, aes(x = altitude)) +
    geom_histogram(aes(y = after_stat(density)), bins = 18) +
    geom_line(data = as.data.frame(marginal.altitude), aes(x = x, y = y)) +
    labs(
        x = "Altitude effect size",
        y = "Density"
    )

Next, we want to simulate data from the model. This step is important as it allows us to see if our model makes sense or not. In particular, we identify where our model fails and improve it.

alt_vals <- seq(-1, 3, length.out = 100)
y.sample <- inla.posterior.sample.eval(
    fun = function(...) {
        mu <- exp(Intercept + altitude_s * alt_vals)
        abundance <- rpois(length(mu), mu)
    },
    samples = m.poisson.samples
)

library(bayestestR)
y.sum <- apply(y.sample, 1, function(x) {
    y.hdi <- hdi(x, ci = 0.89)
    c(mean = mean(x), hdi_low = y.hdi$CI_low, hdi_high = y.hdi$CI_high)
})
data.sim <- data.frame(
    altitude_s = alt_vals,
    t(y.sum)
)

ggplot(data.sim, aes(x = altitude_s, y = mean)) +
    # geom_point(data = data, aes(x = altitude_s, y = abundance), colour = "grey80") +
    geom_ribbon(aes(ymin = hdi_low, ymax = hdi_high), fill = "steelblue", alpha = 0.3) +
    geom_line(colour = "steelblue") +
    labs(
        x = "Scaled altitude",
        y = "Eel abundance"
    )

Next, we want to plot actual data against this prediction. Because we have not included spatial and temporal random effects while sampling the posterior, we are making predictions for an average site and year. Therefore, we want to average abundance data over site and year.

data.avg <- data |>
    group_by(altitude_s) |>
    summarise(
        abundance_avg = mean(abundance),
        abundance_var = var(abundance)
    )
data.avg
# A tibble: 32 × 3
   altitude_s[,1] abundance_avg abundance_var
            <dbl>         <dbl>         <dbl>
 1         -0.979         13.0           52.4
 2         -0.954         35.1          416. 
 3         -0.929         50.2         1692. 
 4         -0.904         30.3          235. 
 5         -0.830         43.1         2204. 
 6         -0.780         50.1         1004. 
 7         -0.705         37.6          350. 
 8         -0.680         42.4          624. 
 9         -0.655         26.6          332. 
10         -0.630          9.44          15.3
# ℹ 22 more rows
ggplot(data.sim, aes(x = altitude_s, y = mean)) +
    geom_point(data = data.avg, aes(x = altitude_s, y = abundance_avg), colour = "grey80") +
    geom_ribbon(aes(ymin = hdi_low, ymax = hdi_high), fill = "steelblue", alpha = 0.3) +
    geom_line(colour = "steelblue") +
    labs(
        x = "Scaled altitude",
        y = "Eel abundance"
    )

We see that our model capture the clear decreasing trend of eel abundances with altitude.

Let’s investigate overdispersion with a posterior predictive check.

y_pred_avg <- apply(y.sample, 1, mean)
y_pred_var <- apply(y.sample, 1, var)
pred_df <- data.frame(
    abundance_avg = y_pred_avg,
    abundance_var = y_pred_var
)
pred_df$type <- "Posterior predictive"
pred_df$altitude_s <- alt_vals
data.avg$type <- "Observed"
plot_df <- rbind(pred_df, data.avg)

ggplot(plot_df, aes(x = abundance_avg, y = abundance_var, colour = type)) +
    geom_point() +
    geom_abline(slope = 1, intercept = 1) +
    labs(
        x = "Observed mean abundance",
        y = "Observed variance in abundace"
    )

We see clearly that observed data is overdispersed (variance >> mean) compared to the data generated by our model. This suggest that the poisson distribution is not suited for this task.

4.1.2. Negative binomial model

To account for overdispersion, we will test the negative binomial distribution. Basically, we will repeat the previous section but with a model where poisson distribution is replaced with a negative binomial one.

m.nbinom <- inla(
    abundance ~ 1 + altitude_s +
        f(year, model = "iid") +
        f(pop_id, model = "iid"),
    family = "nbinomial",
    data = data,
    control.compute = list(config = TRUE, waic = TRUE, dic = TRUE)
)
summary(m.nbinom)
Time used:
    Pre = 0.459, Running = 0.267, Post = 0.0629, Total = 0.788 
Fixed effects:
              mean    sd 0.025quant 0.5quant 0.975quant   mode kld
(Intercept)  2.036 0.158      1.721    2.037      2.344  2.037   0
altitude_s  -1.412 0.164     -1.737   -1.412     -1.092 -1.412   0

Random effects:
  Name    Model
    year IID model
   pop_id IID model

Model hyperparameters:
                                                        mean    sd 0.025quant
size for the nbinomial observations (1/overdispersion)  2.18 0.185      1.836
Precision for year                                     18.59 8.697      7.432
Precision for pop_id                                    1.46 0.411      0.806
                                                       0.5quant 0.975quant
size for the nbinomial observations (1/overdispersion)     2.17       2.56
Precision for year                                        16.73      40.80
Precision for pop_id                                       1.41       2.41
                                                        mode
size for the nbinomial observations (1/overdispersion)  2.16
Precision for year                                     13.62
Precision for pop_id                                    1.31

Deviance Information Criterion (DIC) ...............: 3440.05
Deviance Information Criterion (DIC, saturated) ....: 622.13
Effective number of parameters .....................: 53.89

Watanabe-Akaike information criterion (WAIC) ...: 3443.63
Effective number of parameters .................: 50.78

Marginal log-Likelihood:  -1783.55 
 is computed 
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')

Let’s plot the model prediction against actual data.

m.nbinom.samples <- inla.posterior.sample(1000, m.nbinom)

alt_vals <- seq(-1, 3, length.out = 100)
y.sample <- inla.posterior.sample.eval(
    fun = function(...) {
        mu <- exp(Intercept + altitude_s * alt_vals)
        abundance <- rnbinom(length(mu), mu = mu, size = theta[1])
    },
    samples = m.nbinom.samples
)

y.sum <- apply(y.sample, 1, function(x) {
    y.hdi <- hdi(x, ci = 0.89)
    c(mean = mean(x), hdi_low = y.hdi$CI_low, hdi_high = y.hdi$CI_high)
})
data.sim <- data.frame(
    altitude_s = alt_vals,
    t(y.sum)
)

ggplot(data.sim, aes(x = altitude_s, y = mean)) +
    geom_point(data = data.avg, aes(x = altitude_s, y = abundance_avg), colour = "grey80") +
    geom_ribbon(aes(ymin = hdi_low, ymax = hdi_high), fill = "steelblue", alpha = 0.3) +
    geom_line(colour = "steelblue") +
    labs(
        x = "Scaled altitude",
        y = "Eel abundance"
    )

We see that the negative binomial distribution better account for the variance. We will stick to this distribution for now, although we could have tried other distributions. Notably, the ZIP-poisson could be useful when we have many 0s.

4.2. Modeling time

Let’s now focus on modelling the effect of time. To begin with, let’s inspect the raw data.

ggplot(data, aes(x = year, y = abundance, color = altitude_s)) +
    geom_jitter(width = 0.2) +
    stat_summary(fun = mean, geom = "line", colour = "grey", size = 1) +
    labs(x = "Year", ylab = "Eel abundance", color = "Altitude (scaled)")

The plain line indicates the trend of the mean abundance over all sites. This trend is decreasing, we expect to retrieve this trend in our temporal random effects.

4.2.1. IID random effect

We begin by assuming that year random effects are independent from one another. We expect this assumption to be wrong, but let’s start from there and then explore more realistic modeling choices.

m.iid <- inla(
    abundance ~ 1 + altitude_s +
        f(year, model = "iid") +
        f(pop_id, model = "iid"),
    family = "nbinomial",
    data = data,
    control.compute = list(config = TRUE, waic = TRUE)
)
re.year.iid <- m.iid$summary.random$year

Here was our first try, yet we expect some correlation from one year to another. For this reason, time random effect are often modelled with autoregressive model (AR1 if lag 1).

4.2.2. AR1 Random effect

We modify the previous model changing the year random effect from IID to AR1.

m.ar <- inla(
    abundance ~ 1 + altitude_s +
        f(year, model = "ar1") +
        f(pop_id, model = "iid"),
    family = "nbinomial",
    data = data,
    control.compute = list(config = TRUE, waic = TRUE)
)
re.year.ar <- m.ar$summary.random$year

4.2.3. RW1 random effect

Same with RW1 random effect.

m.rw <- inla(
    abundance ~ 1 + altitude_s +
        f(year, model = "rw1") +
        f(pop_id, model = "iid"),
    family = "nbinomial",
    data = data,
    control.compute = list(config = TRUE, waic = TRUE)
)
re.year.rw <- m.rw$summary.random$year

4.2.4. Model comparisons

Now that we have the three models, we can compare them.

First, we can plot random effect vs year.

re.year.iid$type <- "iid"
re.year.ar$type <- "ar"
re.year.rw$type <- "rw"
re.year.all <- rbind(re.year.iid, re.year.ar, re.year.rw)

ggplot(re.year.all, aes(x = ID, y = mean, color = type)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
    geom_ribbon(aes(ymin = `0.025quant`, ymax = `0.975quant`), alpha = 0.1) +
    geom_line() +
    geom_point() +
    labs(
        x = "Year",
        y = "Year random effect",
    )

We see that AR1 and RW1 produce random effects with a smooth trend. By contrast, IID random effects produces ‘chaotic’ random effects due to their independence. Second, we can inspect model WAIC, where lower values implies better out-of-sample predictive performance.

m.iid$waic$waic
[1] 3443.628
m.ar$waic$waic
[1] 3428.846
m.rw$waic$waic
[1] 3428.494

We see that AR1 and RW1 have lower WAIC signaling better predictive ability, as we could have expected.

We can also do a predictive check by plotting predicted trends against the observed one.

pred_iid <- tapply(m.iid$summary.fitted.values$mean, data$year, mean)
pred_ar1 <- tapply(m.ar$summary.fitted.values$mean, data$year, mean)
pred_rw1 <- tapply(m.rw$summary.fitted.values$mean, data$year, mean)

obs <- tapply(data$abundance, data$year, mean)

df <- data.frame(
    year = sort(unique(data$year)),
    obs = obs,
    iid = pred_iid,
    ar1 = pred_ar1,
    rw1 = pred_rw1
)
df <- df |> pivot_longer(
    cols = c("obs", "iid", "ar1", "rw1"),
    names_to = "type",
    values_to = "mean_abundance"
)

ggplot(df, aes(x = year, y = mean_abundance, color = type)) +
    geom_line() +
    labs(
        y = "Mean abundance",
        x = "Year"
    )

Here, we see that there is no strong differences: all three models track well the observed trend. The IID model might be a bit overfitting the trend at the beginning, but otherwise there is not much to say.

4.3. Modeling space

Now that we have seen the different option to model time dynamics, we will focus on space.

4.3.1. IID random effect

First, we begin with IID random effects, assuming site are independent from one another. This assumption is probably unrealistic but still provide a useful baseline.

We fit the model, extract spatial random effects and join them to the dataframe containing their geographical positions.

m.iid <- inla(
    abundance ~ 1 + altitude_s +
        f(year, model = "ar1") +
        f(pop_id, model = "iid"),
    family = "nbinomial",
    data = data,
    control.compute = list(config = TRUE, waic = TRUE)
)

re.site.iid <- m.iid$summary.random$pop_id

pts.re.iid <- pts_vilaine |> left_join(
    re.site.iid |> rename(pop_id = "ID"),
    by = "pop_id"
)

Next, we can plot the random effect on a map, to check for spatial correlation.

ggplot(pts.re.iid) +
    geom_sf(aes(color = mean), size = 2) +
    scale_color_gradient2(
        low = "blue",
        mid = "white",
        high = "red",
        midpoint = 0
    ) +
    labs(
        color = "RE site",
    )

We do see a clear spatial correlation, from west to east going from negative to positive random effects. This spatial correlation in the residuals motivates the use of spatially structured residuals. We explore the different options to do so in the following subsections.

4.3.2. SPDE random effect

First step is to get site coordinates in meters, because angles distort distances between sites, which is going to bias our spatial model.

pts_vilaine.proj <- st_transform(pts_vilaine, 2154) # Coordinates in meters.
coords <- st_coordinates(pts_vilaine.proj)

From these coordinates we can build the mesh using fmesher and visualise it.

library(fmesher)
mesh <- fm_mesh_2d_inla(loc = coords)
plot(mesh)
points(coords, col = "red", pch = 16, cex = 2)

Note

The mesh can be customised to increase or decrease its resolution using additional parameters. For our simple case study, we won’t focus on these details.

Next, we define the SPDE model and the projection matrix \(A\). \(A\) projects the spatial random field to actual observation locations.

spde <- inla.spde2.matern(mesh)
A <- inla.spde.make.A(mesh = mesh, loc = coords)
spatial_index <- inla.spde.make.index(
    name = "spatial.field",
    n.spde = spde$n.spde
)
dim(A)
[1] 37 79
nrow(mesh$loc)
[1] 79

We see that \(A\) is of dimension (number of sites x number of mesh knots).

Because we have multiple observations per sites we have to expand A so the spatial random field is projected for each observations (and not locations). To do so, we have to match each observation to its location.

site_idx <- match(data$pop_id, pts_vilaine$pop_id)
A_expanded <- A[site_idx, ]
dim(A_expanded)
[1] 526  79
nrow(data)
[1] 526

Next, we can build the data stack which combines: data, effects (i.e. predictors) and observation matrices.

# Prepare the data stack
data_stack <- inla.stack(
    data = list(y = data$abundance),
    A = list(A_expanded, 1),
    effects = list(
        spatial.field = spatial_index,
        data.frame(
            intercept = rep(1, nrow(data)),
            altitude_s = data$altitude_s,
            year = data$year,
            pop_id = data$pop_id
        )
    ),
    tag = "est"
)

data_stack
Data stack with
  data:    (y), size: 526
  effects: (spatial.field, spatial.field.group, spatial.field.repl, intercept, altitude_s, year, pop_id), size: 563
  A:       526 times 563

Now everything is (finally) ready to fit the model.

m.spde <- inla(
    y ~ -1 + intercept + altitude_s +
        f(year, model = "ar1") +
        f(spatial.field, model = spde),
    family = "nbinomial",
    data = inla.stack.data(data_stack),
    control.predictor = list(A = inla.stack.A(data_stack)),
    control.compute = list(config = TRUE, waic = TRUE)
)

summary(m.spde)
Time used:
    Pre = 0.754, Running = 0.498, Post = 0.105, Total = 1.36 
Fixed effects:
             mean    sd 0.025quant 0.5quant 0.975quant   mode kld
intercept   2.056 0.313      1.433    2.055      2.692  2.055   0
altitude_s -1.396 0.229     -1.848   -1.397     -0.941 -1.397   0

Random effects:
  Name    Model
    year AR1 model
   spatial.field SPDE2 model

Model hyperparameters:
                                                        mean    sd 0.025quant
size for the nbinomial observations (1/overdispersion)  2.21 0.185      1.867
Precision for year                                     14.05 7.264      4.333
Rho for year                                            0.77 0.118      0.489
Theta1 for spatial.field                                7.51 0.397      6.691
Theta2 for spatial.field                               -8.77 0.321     -9.373
                                                       0.5quant 0.975quant
size for the nbinomial observations (1/overdispersion)    2.203      2.597
Precision for year                                       12.593     32.131
Rho for year                                              0.791      0.939
Theta1 for spatial.field                                  7.526      8.253
Theta2 for spatial.field                                 -8.783     -8.111
                                                         mode
size for the nbinomial observations (1/overdispersion)  2.189
Precision for year                                      9.889
Rho for year                                            0.837
Theta1 for spatial.field                                7.587
Theta2 for spatial.field                               -8.829

Watanabe-Akaike information criterion (WAIC) ...: 3431.84
Effective number of parameters .................: 45.48

Marginal log-Likelihood:  -1778.47 
 is computed 
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')

First, let’s compare the WAIC of the SPDE model with the IID one.

m.iid$waic$waic
[1] 3428.852
m.spde$waic$waic
[1] 3431.841

We see that they perform similarly in terms of predictive performance. This inspection doesn’t indicate that a model is clearly better than another.

Important

WAIC is a measure of out-of-sample predictive performance. A model can be wrong (in the sense of causal structure) and predict well, while a model can be right and predict poorly. It is important to keep this fact in mind while comparing modelling performance with WAIC.

To go further, we extract spatial random effects and want to visual them.

re.site.spde <- m.spde$summary.random$spatial.field
proj <- fm_evaluator(mesh)

# Get spatial field mean and sd
spatial_mean <- proj$proj$A %*% re.site.spde$mean
spatial_sd <- proj$proj$A %*% re.site.spde$sd

# Create data frame for ggplot
spatial_proj <- data.frame(
    x = rep(proj$x, length(proj$y)),
    y = rep(proj$y, each = length(proj$x)),
    mean = as.vector(spatial_mean),
    sd = as.vector(spatial_sd)
)

We also extract the edges of the river network to plot it on the subsequent figure.

edges_sf <- net %>%
    activate("edges") %>%
    st_as_sf()
edges_sf <- st_transform(edges_sf, st_crs(pts_vilaine.proj))

Visualize the spatial random effects on a map.

ggplot(spatial_proj, aes(x = x, y = y, fill = mean)) +
    geom_raster() +
    scale_fill_gradient2(
        low = "blue",
        mid = "white",
        high = "red",
        midpoint = 0
    ) +
    geom_sf(
        data = edges_sf,
        inherit.aes = FALSE,
        size = 0.3,
        alpha = 0.3
    ) +
    geom_sf(
        data = pts_vilaine.proj,
        aes(geometry = geometry),
        inherit.aes = FALSE
    ) +
    labs(
        fill = "SPDE RE",
        x = "",
        y = ""
    )

As well as their uncertainty.

ggplot(spatial_proj, aes(x = x, y = y, fill = sd)) +
    geom_raster() +
    geom_sf(data = edges_sf, inherit.aes = FALSE, color = "black", size = 0.3) +
    geom_sf(
        data = pts_vilaine.proj,
        aes(geometry = geometry),
        inherit.aes = FALSE
    ) +
    labs(
        fill = "SD RE",
        x = "",
        y = ""
    )

As expected we see that uncertainty is lower near sites, and increases as we go further from them.

We see from plot above that SPDE makes sense for continuous process in space, as it can interpolate anywhere in the plane based on some assumed correlation structure. However, we know that eel do not live on land and therefore that their abundane is highly discontinuous. Some methods have adpated the SPDE framework to account for spatial barriers. Yet, this method doesn’t seem suited for river networks as we would have mostly barrier with a very complex topology. So let’s try something else.

4.3.3. Besag random effect

TODO